In [1]:
import numpy as np
In [2]:
import os
import zipfile
import requests
from sklearn.svm import LinearSVC
import numpy.matlib
from yellowbrick.classifier import ROCAUC
In [3]:
def add_column(data_array,new_column):
nc = numpy.reshape(new_column,(-1,1))
if data_array is None:
return nc
else:
return numpy.concatenate((data_array,nc),axis=1)
In [4]:
def add_categorical(data_array,new_column,datatype):
nc = numpy.reshape(new_column,(-1,1))
unq = numpy.unique(new_column)
nrows = numpy.size(new_column)
new_columns = None
for y in unq:
newcol = numpy.zeros((nrows,1),dtype=datatype)
newcol[numpy.where(nc==y)] = 1
if new_columns is None:
new_columns = newcol
else:
new_columns = numpy.concatenate((new_columns,newcol),axis=1)
if data_array is None:
return new_columns
else:
return numpy.concatenate((data_array,new_columns),axis=1)
In [5]:
creditdata_info = {
'url': 'https://s3.amazonaws.com/ddl-data-lake/yellowbrick/credit.zip',
'signature': '4a91339c69f55e18f3f48004328fbcb7868070b618208fed099920427b084e5e',
'type': 'numpy',
}
In [6]:
path = os.getcwd()
response = requests.get(creditdata_info['url'], stream=True)
dlpath = 'creditdata.zip'
with open(dlpath, 'wb') as f:
for chunk in response.iter_content(65536):
f.write(chunk)
In [7]:
zf = zipfile.ZipFile(dlpath)
zf.extractall(path)
In [8]:
credittxt_path = os.path.join(path,'credit','credit.csv')
credit = np.genfromtxt(credittxt_path, dtype=float, delimiter=',', names=True)
In [9]:
credit_keys = credit.dtype.names
datatype = credit.dtype[0]
ncols = len(credit_keys)
categorical_names = ['edu','married']
y_name = 'default'
credit_data = None
for j in range(0,ncols):
if credit_keys[j] in categorical_names:
credit_data = add_categorical(credit_data,credit[credit_keys[j]],datatype)
elif credit_keys[j] == y_name:
y = credit[y_name].astype(int)
else:
credit_data = add_column(credit_data,credit[credit_keys[j]])
In [10]:
#Rescale the dataset by subtracting the mean from each column and dividing by the standard deviation
datashape = credit_data.shape
nrows = datashape[0]
cmeans = np.mean(credit_data,0)
repmeans = numpy.matlib.repmat(cmeans,nrows,1)
mydata = credit_data - repmeans
sstds = np.std(mydata,0)
repstds = numpy.matlib.repmat(sstds,nrows,1)
mydata = np.divide(mydata,repstds)
In [11]:
visualizer = ROCAUC(LinearSVC())
visualizer.fit(mydata,y)
visualizer.score(mydata,y)
visualizer.show()
In [ ]: